import re
import zipfile
from collections import Counter
from csv import writer
from itertools import zip_longest
from pathlib import Path

import jieba

# Path of the archive that holds the source .txt books.
zip_path = r'C:\Users\王相辰\Desktop\python上课练习\books.zip'

# Only archive members with this suffix are processed.
TXT_SUFFIX = '.txt'

# Compiled once because both patterns are applied to every archive member.
CHINESE_RUN_RE = re.compile(r'[\u4e00-\u9fa5]+')
# re.ASCII makes \b treat CJK characters as non-word characters, so an
# English word written directly next to Chinese text (no spaces) still
# has a boundary on both sides and is matched.
ENGLISH_WORD_RE = re.compile(r'\b[a-zA-Z]+(?:\'[a-zA-Z]+)?\b', re.ASCII)


def decode_text(raw):
    """Decode the bytes of a text file into ``str``.

    UTF-8 (with or without BOM) is tried first; if the bytes are not valid
    UTF-8, GB18030 (a superset of GBK/GB2312) is used instead.  This replaces
    the Windows-only ``'ANSI'`` codec alias, which is unavailable on other
    platforms and depends on the active code page.

    Raises:
        UnicodeDecodeError: if the bytes are valid in neither encoding.
    """
    try:
        return raw.decode('utf-8-sig')
    except UnicodeDecodeError:
        return raw.decode('gb18030')


def derive_output_path(member_name, new_suffix):
    """Return the output path for *member_name* with its suffix swapped.

    Only the trailing ``.txt`` is replaced; any other ``.txt`` occurrence
    inside the name is left untouched.
    """
    if member_name.endswith(TXT_SUFFIX):
        base = member_name[:-len(TXT_SUFFIX)]
    else:
        base = member_name
    return Path(base + new_suffix)


def build_rows(chinese_counts, english_counts):
    """Pair Chinese and English (word, count) entries into CSV rows.

    Entries are paired by position in each mapping's iteration order; the
    shorter side is padded with ``('', 0)``.
    """
    paired = zip_longest(
        chinese_counts.items(), english_counts.items(), fillvalue=('', 0)
    )
    return [
        [zh_word, zh_count, en_word, en_count]
        for (zh_word, zh_count), (en_word, en_count) in paired
    ]


def write_text(path, text):
    """Write *text* to *path* as UTF-8, creating parent directories first."""
    # Archive members may live in subdirectories that do not exist locally.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as out:
        out.write(text)


def process_member(zip_ref, file_info):
    """Extract, save and count the Chinese and English text of one member.

    Writes three files relative to the current working directory, named
    after the member: ``<name>.Chinese.txt``, ``<name>.English.txt`` and
    ``<name>.csv`` (word frequencies).
    """
    # NOTE(review): member names are used as output paths unchanged, so the
    # archive is assumed to be trusted (no absolute or '..' entries).
    member_name = file_info.filename

    with zip_ref.open(file_info) as file:
        content = decode_text(file.read())

    # Extract the Chinese and English content.
    chinese_content = ''.join(CHINESE_RUN_RE.findall(content))
    english_words = ENGLISH_WORD_RE.findall(content)
    english_content = ' '.join(english_words)

    # Save the extracted text to separate files.
    write_text(derive_output_path(member_name, '.Chinese.txt'), chinese_content)
    write_text(derive_output_path(member_name, '.English.txt'), english_content)

    # Segment the Chinese text with jieba and count word frequencies.
    # Counter keeps first-appearance order, which fixes the CSV row order.
    chinese_counts = Counter(jieba.lcut(chinese_content))
    # English words are counted case-insensitively.
    english_counts = Counter(word.lower() for word in english_words)

    # Save the frequency table to a CSV file.
    save_path = derive_output_path(member_name, '.csv')
    save_path.parent.mkdir(parents=True, exist_ok=True)
    with open(save_path, "w", encoding='utf-8', newline='') as f:
        csv_writer = writer(f)
        csv_writer.writerow(['Chinese Word', 'Chinese Count', 'English Word', 'English Count'])
        csv_writer.writerows(build_rows(chinese_counts, english_counts))


def main():
    """Process every ``.txt`` member of the archive at ``zip_path``."""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.filename.endswith(TXT_SUFFIX):
                process_member(zip_ref, file_info)


if __name__ == '__main__':
    main()
